import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
sns.set(style='whitegrid', font_scale=1.3)
%matplotlib inline
In this notebook I try to solve binary classification problem. I want to create model with best params to predict wheteher job ad is real or fake.
To achieve good result I preprocess initial text features and add new ones.
Additionally, I want to find out most important features for my model.
data = pd.read_csv('fake_job_postings.csv', index_col='job_id')
data.shape
data.head(4)
data.info()
(data.isna().sum() / data.shape[0]).sort_values(ascending=False)
data.nunique()
print('fraudulent == 1 count {} or {:.2f}% from all data'.format(
data['fraudulent'].sum(),
data['fraudulent'].sum() / data.shape[0] * 100)
)
Extract country, state and city info from location columns
data['location'] = data['location'].fillna(',,')
data['location'] = data['location'].apply(lambda x: x if len(x.split(',')) >= 3 else x + ',,')
# Add region features
data['country'] = data['location'].apply(lambda x: x.split(',')[0])
data['state'] = data['location'].apply(lambda x: x.split(',')[1])
data['city'] = data['location'].apply(lambda x: x.split(',')[2])
data.drop(columns=['location'], inplace=True)
Extract word count and length fo the whole text description for each text column.
text_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
num_col = []
name_col = []
# Добавим признаки длины текста
for col in text_columns:
data['len_' + col] = data[col].fillna('').apply(lambda x: len(x))
data['cnt_' + col] = data[col].fillna('').apply(lambda x: len(x.split()))
num_col.append('len_' + col)
num_col.append('cnt_' + col)
name_col.append(col + ' length')
name_col.append(col + ' count')
data.drop(columns=text_columns, inplace=True)
Let`s explore our data!
positive = data[data['fraudulent'] == 1]
negative = data[data['fraudulent'] == 0]
fraud_c = '#f7714f'
real_c = '#4fe9f7'
fig, axes = plt.subplots(5, 2, figsize=(14, 16))
fig.tight_layout(pad=3.5)
legend = {'cnt' : 'word count', 'len' : 'character length'}
for ax, col, name in zip(axes.flat, num_col, name_col):
ax.hist(positive[positive[col] < positive[col].quantile(0.95)][col], bins=20, alpha=0.5, density=True, color=fraud_c)
ax.hist(negative[negative[col] < negative[col].quantile(0.95)][col], bins=20, alpha=0.5, density=True, color=real_c)
ax.set_title('Distribution of {}'.format(name))
ax.legend(['Fruad', 'Real'])
ax.set_xlim([- negative[col].quantile(0.95) / 70, negative[col].quantile(0.95)])
ax.set_xlabel(legend[col[:3]])
I suppose these features will have a big impact on fraudulent prediction model
cols = ['required_education', 'required_experience', 'employment_type']
legend = {0: 'Real', 1: 'Fraud'}
fig = make_subplots(
rows=1, cols=3,
subplot_titles=cols,
shared_yaxes=True,
)
colors = [real_c, fraud_c]
for idx, name in zip(range(1, 4), cols):
for fraud, color in zip((0, 1), colors):
if name != 'required_education':
show_leg = False
else:
show_leg=True
fig.add_trace(go.Histogram(
x=data[data['fraudulent'] == fraud][name],
histnorm='percent',
name=legend[fraud],
xbins=dict(start=-1, end=6, size=1),
marker_color=color,
opacity=0.8,
showlegend=show_leg
),
1, idx)
fig.update_layout(
height=500, width=1000,
#legend_title_text='Fraud',
yaxis_title_text='Percent',
bargap=0.2,
bargroupgap=0.1,
xaxis = dict(tickmode = 'linear', tick0 = -1, dtick = 1),
)
fig.show()
There is a small difference in required_education. Fraud ads ussually have unspecified in this feature.
Use integer labels. Fill NaN with zero value.
categorical_col = [
'department', 'salary_range', 'employment_type',
'required_experience', 'required_education',
'industry', 'function',
'country', 'state', 'city'
]
for cat in categorical_col:
val = set(data[cat].values)
category_mapping = dict(zip(val, range(len(val))))
data[cat] = data[cat].fillna(np.nan).apply(lambda x: category_mapping[x])
plt.figure(figsize=(18,14))
cor = data.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
# Class weigths
positive = data[data['fraudulent'] == 1].shape[0]
negative = data[data['fraudulent'] == 0].shape[0]
class_weights = {0: 1, 1: negative / positive}
target = data['fraudulent']
features = data.drop(columns=['fraudulent'])
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=0) # Split the data
scaler = StandardScaler()
scaled = scaler.fit_transform(features)
assert scaled.mean() < 1e-6 and (scaled.std() - 1) < 1e-6
sc_x_train, sc_x_test, sc_y_train, sc_y_test = train_test_split(features, target, test_size=0.3, random_state=0)
# Balance
tmp = x_test
tmp['ans'] = y_test
x_test = pd.concat([tmp, tmp[tmp.ans == 1].sample(5100, replace=True)])
y_test = tmp['ans']
x_test = tmp.drop(columns = ['ans'])
def calculate_metrics(pred, real):
tp = (pred[real == 1] == 1).sum()
fp = (pred[real == 0] == 1).sum()
fn = (pred[real == 1] == 0).sum()
accuracy = (pred == real).sum() / real.shape[0]
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)
return accuracy, precision, recall, f1
def train_and_evaluate_model(classifier, x_train_, y_train_, x_test_, y_test_):
model = classifier
model.fit(x_train_, y_train_)
predicted = model.predict(x_test_)
print(print("Accuracy {} \nPrecision {}\nRecall {}\nF1 {}".format(*calculate_metrics(predicted, y_test_))))
return predicted, model
predicted, LogReg = train_and_evaluate_model(
LogisticRegression(class_weight='balanced', solver='liblinear'),
x_train,
y_train,
x_test,
y_test
)
Good accuracy, but F1 is only 0.22
plt.figure(figsize=(12, 4))
coef = pd.DataFrame(columns=['cat', 'coef'])
coef['cat'] = data.drop(columns=['fraudulent']).columns.values
coef['coef'] = abs(LogReg.coef_[0])
coef.sort_values(by='coef', inplace=True, ascending=False)
plt.bar(coef['cat'], coef['coef'], color=fraud_c, alpha=0.5)
plt.title('Log scaled feature importance from LogReg')
plt.yscale('log')
plt.xticks(rotation=-90)
plt.show()
The most significant impact from logo, telecomuting, has_questions and required_experience.
# Catboost
catboost_pred, catboost_model = train_and_evaluate_model(
CatBoostClassifier(
num_trees = 100,
verbose=0,
class_weights=list(class_weights.values())
),
x_train,
y_train,
x_test,
y_test
)
# XGBoost
predicted, XGB_model = train_and_evaluate_model(
XGBClassifier(
num_trees=100,
class_weights=list(class_weights.values())
),
x_train,
y_train,
x_test,
y_test
)
We have achieved 0.98 accuracy and 0.80 F1 score
plt.figure(figsize=(12, 4))
coef = pd.DataFrame(columns=['cat', 'coef'])
coef['cat'] = data.drop(columns=['fraudulent']).columns.values
coef['coef'] = XGB_model.feature_importances_
coef.sort_values(by='coef', inplace=True, ascending=False)
plt.bar(coef['cat'], coef['coef'], color=fraud_c, alpha=0.5)
plt.title('Feature importance from XGBoost')
plt.xticks(rotation=-90)
plt.show()
Finally, we got slightly different feature importance. The most important features for boosting were info about company (company_profile, has_company_logo, cnt_company_profile), as well as required experience. As we have seen before required_experience have had a different distribution for fraud and real ads.
params = {'num_trees': [75, 100, 150], 'depth': [4, 6], 'verbose': [0]}
grid_search = GridSearchCV(XGBClassifier(), params)
grid_search.fit(x_train, y_train)
print("Accuracy {} \nPrecision {}\nRecall {}\nF1 {}".format(
*calculate_metrics(grid_search.best_estimator_.predict(x_test), y_test))
)
Unfortunately, we haven`t improved score. So, I suppose initial params were good enought.
I tried Logistic Regression as a baseline. The model`s performance: 0.78 accuracy, 0.22 F1.
The best score was from XGB_classifier (hahaha gradient boosting go brrrrr...). The model`s performance: 0.98 accuracy, 0.80 F1.
Fake ads ussually have poor company description and unspecify required experience.